cv_states <- as.data.frame(read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
state_pops <- as.data.frame(read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
cv_states <- merge(cv_states, state_pops, by = "state")Lab_11
1. Reading and Processing Data
2. Look at the data
dim(cv_states)[1] 58094 9
head(cv_states) state date fips cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04 1 1587224 21263 1 4887871 96.50939 AL
2 Alabama 2020-04-25 1 6213 213 1 4887871 96.50939 AL
3 Alabama 2023-02-26 1 1638348 21400 1 4887871 96.50939 AL
4 Alabama 2022-12-03 1 1549285 21129 1 4887871 96.50939 AL
5 Alabama 2020-05-06 1 8691 343 1 4887871 96.50939 AL
6 Alabama 2021-04-21 1 524367 10807 1 4887871 96.50939 AL
tail(cv_states) state date fips cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11 56 175290 1884 56 577737 5.950611 WY
58090 Wyoming 2022-08-21 56 173487 1871 56 577737 5.950611 WY
58091 Wyoming 2021-01-26 56 51152 596 56 577737 5.950611 WY
58092 Wyoming 2021-02-21 56 53795 662 56 577737 5.950611 WY
58093 Wyoming 2021-08-22 56 70671 809 56 577737 5.950611 WY
58094 Wyoming 2021-03-20 56 55581 693 56 577737 5.950611 WY
str(cv_states)'data.frame': 58094 obs. of 9 variables:
$ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
$ date : chr "2023-01-04" "2020-04-25" "2023-02-26" "2022-12-03" ...
$ fips : int 1 1 1 1 1 1 1 1 1 1 ...
$ cases : int 1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
$ deaths : int 21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
$ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
$ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
$ abb : chr "AL" "AL" "AL" "AL" ...
3. Format the data
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
cv_states <- cv_states[order(cv_states$state, cv_states$date),]
str(cv_states)'data.frame': 58094 obs. of 9 variables:
$ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
$ date : Date, format: "2020-03-13" "2020-03-14" ...
$ fips : int 1 1 1 1 1 1 1 1 1 1 ...
$ cases : int 6 12 23 29 39 51 78 106 131 157 ...
$ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
$ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
$ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
$ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states) state date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
597 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
282 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
12 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
266 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
78 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states) state date fips cases deaths geo_id population pop_density abb
57902 Wyoming 2023-03-18 56 185640 2009 56 577737 5.950611 WY
57916 Wyoming 2023-03-19 56 185640 2009 56 577737 5.950611 WY
57647 Wyoming 2023-03-20 56 185640 2009 56 577737 5.950611 WY
57867 Wyoming 2023-03-21 56 185800 2014 56 577737 5.950611 WY
58057 Wyoming 2023-03-22 56 185800 2014 56 577737 5.950611 WY
57812 Wyoming 2023-03-23 56 185800 2014 56 577737 5.950611 WY
summary(cv_states) state date fips cases
Washington : 1158 Min. :2020-01-21 Min. : 1.00 Min. : 1
Illinois : 1155 1st Qu.:2020-12-06 1st Qu.:16.00 1st Qu.: 112125
California : 1154 Median :2021-09-11 Median :29.00 Median : 418120
Arizona : 1153 Mean :2021-09-10 Mean :29.78 Mean : 947941
Massachusetts: 1147 3rd Qu.:2022-06-17 3rd Qu.:44.00 3rd Qu.: 1134318
Wisconsin : 1143 Max. :2023-03-23 Max. :72.00 Max. :12169158
(Other) :51184
deaths geo_id population pop_density
Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
1st Qu.: 1598 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
Median : 5901 Median :29.00 Median : 4468402 Median : 107.860
Mean : 12553 Mean :29.78 Mean : 6397965 Mean : 423.031
3rd Qu.: 15952 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
Max. :104277 Max. :72.00 Max. :39557045 Max. :11490.120
NA's :1106
abb
WA : 1158
IL : 1155
CA : 1154
AZ : 1153
MA : 1147
WI : 1143
(Other):51184
min(cv_states$date)[1] "2020-01-21"
max(cv_states$date)[1] "2023-03-23"
4. Add new cases, new deaths, and correct outliers
library(plotly)Loading required package: ggplot2
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(zoo)
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
for (i in 1:length(state_list)) {
cv_subset <- subset(cv_states, state == state_list[i])
cv_subset <- cv_subset[order(cv_subset$date),]
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j - 1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j - 1]
}
cv_states$new_cases[cv_states$state == state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state == state_list[i]] = cv_subset$new_deaths
}
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")
p1 <- ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)p1 <- NULL
p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)p2 <- NULL
cv_states$new_cases[cv_states$new_cases < 0] = 0
cv_states$new_deaths[cv_states$new_deaths < 0] = 0
for (i in 1:length(state_list)) {
cv_subset <- subset(cv_states, state == state_list[i])
cv_subset$cases[1] = cv_subset$new_cases[1]
cv_subset$deaths[1] = cv_subset$new_deaths[1]
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j - 1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j - 1]
}
cv_states$cases[cv_states$state == state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state == state_list[i]] = cv_subset$deaths
}
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k = 7, fill = NA, align = 'right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k = 7, fill = NA, align = 'right') %>% round(digits = 0)
p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)5. Add additional variables
# add population-normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases / (cv_states$population / 100000), 1), nsmall = 1))Warning: NAs introduced by coercion
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths / (cv_states$population / 100000), 1), nsmall = 1))Warning: NAs introduced by coercion
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths * 100 / cases), 2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date == max(cv_states$date))6. Explore scatterplots using plotly
library("plotly")
cv_states_today_filter <- cv_states_today %>% filter(state != "District of Columbia")
# Continue with the plot_ly code
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste(paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep=""),
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
hovermode = "compare")Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
7. Explore scatterplot trend interactively using ggplotly and geom_smooth
cv_states_today_scatter <- cv_states_today_filter %>%
select(pop_density, deathsper100k, population)
# Continue with the ggplotly() code
p <- ggplot(cv_states_today_scatter, aes(x=pop_density, y=deathsper100k, size=population)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Add linear regression line
labs(title = "Population Density vs. New Deaths per 100k",
x = "Population Density",
y = "New Deaths per 100k") +
theme_minimal()
# Convert ggplot to interactive plot using ggplotly()
p <- ggplotly(p)`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
p8. Multiple Line Chart
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
filter(state == "Florida") %>%
plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines", name = "New Cases") %>%
add_lines(y = ~new_deaths, name = "New Deaths", opacity = 0.7, text = ~paste("Deaths: ", new_deaths)) %>%
layout(title = "New Cases and Deaths Over Time in Florida",
xaxis = list(title = "Date"),
yaxis = list(title = "Count"),
hovermode = "compare")9. Heatmaps
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>%
select(state, date, new_cases) %>%
filter(date > as.Date("2021-06-01"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
heatmap_plot <- plot_ly(
x = colnames(cv_states_mat2),
y = rownames(cv_states_mat2),
z = ~cv_states_mat2,
type = "heatmap",
showscale = TRUE
)
heatmap_plot# Repeat with newper100k
cv_states_mat <- cv_states %>%
select(state, date, newper100k) %>%
filter(date > as.Date("2021-06-01"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
heatmap_plot_newper100k <- plot_ly(
x = colnames(cv_states_mat2),
y = rownames(cv_states_mat2),
z = ~cv_states_mat2,
type = "heatmap",
showscale = TRUE
)
heatmap_plot_newper100k# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by = "2 weeks")
cv_states_mat <- cv_states %>%
select(state, date, newper100k) %>%
filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
heatmap_plot_filtered <- plot_ly(
x = colnames(cv_states_mat2),
y = rownames(cv_states_mat2),
z = ~cv_states_mat2,)10. Map
# For specified date
pick.date <- "2021-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>%
filter(date == pick.date) %>%
select(state, abb, naive_CFR, cases, deaths) %>%
mutate(hover = paste(state, '<br>', "Naive CFR: ", naive_CFR, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Create the map for specified date
fig_pick.date <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~naive_CFR, text = ~hover, locations = ~state,
color = ~naive_CFR, colors = 'Reds'
) %>%
colorbar(title = paste("Naive CFR as of ", pick.date))
fig_pick.date <- fig_pick.date %>% layout(
title = paste('Naive CFR by State as of ', pick.date, '<br>(Hover for value)'),
geo = set_map_details
)
#############
### Map for today's date
# Extract the data for each state by its abbreviation
cv_per100_today <- cv_states_today %>%
select(state, abb, naive_CFR, cases, deaths) %>%
mutate(hover = paste(state, '<br>', "Naive CFR: ", naive_CFR, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Create the map for today's date
fig_Today <- plot_geo(cv_per100_today, locationmode = 'USA-states') %>%
add_trace(
z = ~naive_CFR, text = ~hover, locations = ~state,
color = ~naive_CFR, colors = 'Reds'
) %>%
colorbar(title = paste("Naive CFR as of ", Sys.Date()))
fig_Today <- fig_Today %>% layout(
title = paste('Naive CFR by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = 0.05)